#data manipulation
import numpy as np
import pandas as pd
# network analysis
import networkx as nx
#plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
#imports for plotly interactive visualisation library
import plotly.graph_objs as go
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#plotly offline mode
init_notebook_mode(connected=True)
#filter warnings for final presentation
import warnings
warnings.filterwarnings("ignore")
#notebook formatting
from IPython.core.display import display, HTML
# read csv file of daily close prices, indexed by the Date column
raw_asset_prices_df = pd.read_csv("IVV_historical.csv", index_col='Date')
# get number of rows and columns of the dataset
# (the original also did a no-op `iloc[:, 0:]` self-copy here — removed)
df_shape = raw_asset_prices_df.shape
print(f"There are {df_shape[0]} rows and {df_shape[1]} columns in the dataset")
# Fix: the index holds date STRINGS like "2020/6/19", so min()/max() compared them
# lexicographically and reported a wrong period ("2020/10/1" sorts before "2020/6/19").
# Parse to timestamps before taking the min/max.
date_index = pd.to_datetime(raw_asset_prices_df.index)
print(f"Data timeperiod covers: {date_index.min()} to {date_index.max()}")
# show first five rows
raw_asset_prices_df.head()
There are 252 rows and 504 columns in the dataset Data timeperiod covers: 2020/10/1 to 2021/6/9
| AAPL | MSFT | AMZN | FB | GOOGL | GOOG | JPM | TSLA | JNJ | UNH | ... | UNM | NOV | PRGO | RL | FOX | DISCA | HFC | UAA | UA | NWS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2020/6/19 | 87.430000 | 195.149994 | 2675.010010 | 238.789993 | 1424.640015 | 1431.719971 | 97.809998 | 200.179993 | 143.830002 | 291.239990 | ... | 16.48 | 11.74 | 54.869999 | 71.389999 | 27.820000 | 22.150000 | 32.240002 | 9.51 | 8.60 | 11.88 |
| 2020/6/22 | 89.717499 | 200.570007 | 2713.820068 | 239.220001 | 1450.660034 | 1451.859985 | 96.750000 | 198.863998 | 143.389999 | 292.670013 | ... | 16.41 | 11.81 | 55.180000 | 73.339996 | 27.600000 | 21.840000 | 31.730000 | 9.51 | 8.58 | 11.99 |
| 2020/6/23 | 91.632500 | 201.910004 | 2764.409912 | 242.240005 | 1463.979980 | 1464.410034 | 97.930000 | 200.356003 | 142.860001 | 297.600006 | ... | 16.42 | 12.13 | 55.070000 | 73.459999 | 27.809999 | 22.160000 | 31.480000 | 9.86 | 8.93 | 12.07 |
| 2020/6/24 | 90.014999 | 197.839996 | 2734.399902 | 234.020004 | 1432.699951 | 1431.969971 | 94.660004 | 192.169998 | 139.820007 | 289.179993 | ... | 15.51 | 11.19 | 53.950001 | 70.650002 | 27.010000 | 21.719999 | 29.580000 | 9.47 | 8.58 | 11.69 |
| 2020/6/25 | 91.209999 | 200.339996 | 2754.580078 | 235.679993 | 1441.099976 | 1441.329956 | 97.959999 | 197.195999 | 139.669998 | 296.220001 | ... | 16.35 | 12.29 | 54.730000 | 70.320000 | 26.750000 | 21.260000 | 29.639999 | 9.60 | 8.67 | 11.77 |
5 rows × 504 columns
# calculate daily log returns of every asset in one vectorised step
# (equivalent to looping over each column and taking np.log(col).diff(1),
# but pandas/numpy apply log and diff column-wise at C speed)
log_returns_df = np.log(raw_asset_prices_df).diff(1)
# check output of log returns dataframe; the first row is NaN by construction
log_returns_df.head()
| AAPL | MSFT | AMZN | FB | GOOGL | GOOG | JPM | TSLA | JNJ | UNH | ... | UNM | NOV | PRGO | RL | FOX | DISCA | HFC | UAA | UA | NWS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2020/6/19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020/6/22 | 0.025827 | 0.027395 | 0.014404 | 0.001799 | 0.018099 | 0.013969 | -0.010896 | -0.006596 | -0.003064 | 0.004898 | ... | -0.004257 | 0.005945 | 0.005634 | 0.026948 | -0.007939 | -0.014094 | -0.015945 | 0.000000 | -0.002328 | 0.009217 |
| 2020/6/23 | 0.021120 | 0.006659 | 0.018470 | 0.012545 | 0.009140 | 0.008607 | 0.012123 | 0.007475 | -0.003703 | 0.016705 | ... | 0.000609 | 0.026735 | -0.001995 | 0.001635 | 0.007580 | 0.014546 | -0.007910 | 0.036142 | 0.039983 | 0.006650 |
| 2020/6/24 | -0.017810 | -0.020363 | -0.010915 | -0.034522 | -0.021598 | -0.022401 | -0.033961 | -0.041715 | -0.021509 | -0.028701 | ... | -0.057015 | -0.080661 | -0.020547 | -0.039003 | -0.029188 | -0.020055 | -0.062254 | -0.040357 | -0.039983 | -0.031989 |
| 2020/6/25 | 0.013188 | 0.012557 | 0.007353 | 0.007068 | 0.005846 | 0.006515 | 0.034268 | 0.025818 | -0.001073 | 0.024053 | ... | 0.052743 | 0.093765 | 0.014354 | -0.004682 | -0.009673 | -0.021406 | 0.002026 | 0.013634 | 0.010435 | 0.006820 |
5 rows × 504 columns
# pairwise Pearson correlation (pandas default, made explicit) between the
# log-return series of every pair of assets
correlation_matrix = log_returns_df.corr(method='pearson')
# inspect the first five rows of the correlation matrix
correlation_matrix.head()
| AAPL | MSFT | AMZN | FB | GOOGL | GOOG | JPM | TSLA | JNJ | UNH | ... | UNM | NOV | PRGO | RL | FOX | DISCA | HFC | UAA | UA | NWS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAPL | 1.000000 | 0.694270 | 0.691274 | 0.634556 | 0.522754 | 0.512635 | 0.029426 | 0.492988 | 0.220646 | 0.205826 | ... | 0.025388 | -0.016512 | 0.115766 | -0.015632 | 0.113062 | 0.024642 | 0.014591 | -0.048183 | -0.046480 | 0.174069 |
| MSFT | 0.694270 | 1.000000 | 0.741089 | 0.608624 | 0.717237 | 0.705780 | 0.040796 | 0.500609 | 0.284675 | 0.324866 | ... | -0.011970 | -0.025410 | 0.099022 | -0.058439 | 0.072394 | -0.047259 | -0.014125 | -0.063657 | -0.067575 | 0.198811 |
| AMZN | 0.691274 | 0.741089 | 1.000000 | 0.688090 | 0.622665 | 0.626068 | -0.070776 | 0.478964 | 0.109954 | 0.217196 | ... | -0.123955 | -0.089619 | -0.038887 | -0.092408 | 0.002334 | -0.070561 | -0.108025 | -0.055354 | -0.060535 | 0.113226 |
| FB | 0.634556 | 0.608624 | 0.688090 | 1.000000 | 0.658633 | 0.657932 | 0.020716 | 0.349741 | 0.187049 | 0.319301 | ... | -0.031314 | -0.046239 | 0.007847 | -0.037584 | 0.074100 | 0.007479 | -0.002316 | 0.010994 | 0.011036 | 0.176690 |
| GOOGL | 0.522754 | 0.717237 | 0.622665 | 0.658633 | 1.000000 | 0.993261 | 0.189606 | 0.377806 | 0.281629 | 0.377376 | ... | 0.127280 | 0.168447 | 0.140194 | 0.083256 | 0.154937 | 0.043162 | 0.147512 | 0.181145 | 0.186265 | 0.266175 |
5 rows × 504 columns
#visualise correlation matrix using a clustered heatmap
# clustermap reorders rows/columns by hierarchical clustering, so groups of
# mutually-correlated assets appear as coloured blocks on the diagonal
display(HTML("<h3>Clustered Heatmap: Correlations between asset price returns</h3>"))
sns.clustermap(correlation_matrix, cmap="RdYlGn")
plt.show()
# flatten the correlation matrix into a long-format edge list
# (one row per ordered asset pair) and rename the columns
edges = correlation_matrix.stack().reset_index()
edges.columns = ['asset_1', 'asset_2', 'correlation']
# drop rows where an asset is paired with itself (self correlation = 1)
not_self = edges['asset_1'] != edges['asset_2']
edges = edges[not_self].copy()
# show the first 5 rows of the edge list dataframe
edges.head()
| asset_1 | asset_2 | correlation | |
|---|---|---|---|
| 1 | AAPL | MSFT | 0.694270 |
| 2 | AAPL | AMZN | 0.691274 |
| 3 | AAPL | FB | 0.634556 |
| 4 | AAPL | GOOGL | 0.522754 |
| 5 | AAPL | GOOG | 0.512635 |
#create undirected graph with weights corresponding to the correlation magnitude
# the edge list contains both (a,b) and (b,a); the undirected Graph de-duplicates them
G0 = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# NOTE(review): nx.info() was removed in networkx 3.0 — confirm the environment
# pins networkx < 3.0, otherwise use print(G0) instead.
print(nx.info(G0))
Name: Type: Graph Number of nodes: 504 Number of edges: 126756 Average degree: 503.0000
fig, ax = plt.subplots(nrows=2, ncols=2,figsize=(20,20))
# draw the full correlation graph with four alternative layouts for comparison;
# order matches the original: circular, random, spring, spectral
layout_specs = [
    (nx.circular_layout, "Circular layout", ax[0, 0]),
    (nx.random_layout, "Random layout", ax[0, 1]),
    (nx.spring_layout, "Spring layout", ax[1, 0]),
    (nx.spectral_layout, "Spectral layout", ax[1, 1]),
]
for layout_fn, title, axis in layout_specs:
    nx.draw(G0, with_labels=True, node_size=700, node_color="#e1575c",
            edge_color='#363847', pos=layout_fn(G0), ax=axis)
    axis.set_title(title)
plt.show()
# 'winner takes all' method - set a minimum correlation threshold and drop
# every edge whose |correlation| falls below it
threshold = 0.8
# create a fresh, fully-connected graph from the edge list
Gx = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# collect the edges that are too weakly correlated to keep
remove = [(asset_1, asset_2)
          for asset_1, asset_2, attrs in Gx.edges(data=True)
          if abs(attrs['correlation']) < threshold]
# remove them in one call
Gx.remove_edges_from(remove)
print(f"{len(remove)} edges removed")
126348 edges removed
def assign_colour(correlation):
    """Map a correlation sign to a colour: green for positive, red otherwise."""
    is_positive = correlation > 0
    return "#9eccb7" if is_positive else "#ffa09b"  # green / red
def assign_thickness(correlation, benchmark_thickness=2, scaling_factor=3):
    """Edge width: benchmark width scaled by |correlation| ** scaling_factor."""
    magnitude = abs(correlation)
    return benchmark_thickness * magnitude ** scaling_factor
def assign_node_size(degree, scaling_factor=50):
    """Scale a node's drawing size linearly with its degree."""
    size = degree * scaling_factor
    return size
# edge colour encodes correlation sign; edge width encodes correlation magnitude
correlations = nx.get_edge_attributes(Gx, 'correlation')
edge_colours = [assign_colour(corr) for corr in correlations.values()]
edge_width = [assign_thickness(corr) for corr in correlations.values()]
# node size encodes the node's number of connections (degree)
node_size = [assign_node_size(degree) for _, degree in Gx.degree]
# draw improved graph
# figure size and title font shared by the following network plots
sns.set(rc={'figure.figsize': (9, 9)})
font_dict = {'fontsize': 18}
# circular layout: edge colour = correlation sign, edge width = |correlation|,
# node size = degree (all computed in the previous cell)
nx.draw(Gx, pos=nx.circular_layout(Gx), with_labels=True,
node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
width=edge_width)
plt.title("Asset price correlations", fontdict=font_dict)
plt.show()
# draw improved graph
# force-directed (Fruchterman-Reingold) layout of the same thresholded graph
nx.draw(Gx, pos=nx.fruchterman_reingold_layout(Gx), with_labels=True,
node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
width = edge_width)
plt.title("Asset price correlations - Fruchterman-Reingold layout",fontdict=font_dict)
plt.show()
# build the minimum spanning tree of Gx (after weak correlations were removed)
mst = nx.minimum_spanning_tree(Gx)
# colour each surviving MST edge by the sign of its correlation
edge_colours = [assign_colour(corr)
                for corr in nx.get_edge_attributes(mst, 'correlation').values()]
# draw the MST with constant node size and edge width
nx.draw(mst, with_labels=True, pos=nx.fruchterman_reingold_layout(mst),
        node_size=200, node_color="#e1575c", edge_color=edge_colours,
        width=1.2)
# set title
plt.title("Asset price correlations - Minimum Spanning Tree",fontdict=font_dict)
plt.show()
def convert_rankings_to_string(ranking):
    """
    Concatenates a list of "node: correlation" strings into a single string,
    the preferred format for the plotly tooltip.
    Inserts html "<br>" after each item in order to add a new line in the tooltip.
    """
    # join builds the result in one pass instead of quadratic += concatenation
    return "".join(f"{r}<br>" for r in ranking)
def calculate_stats(returns=None):
    """calculate annualised returns and volatility for all ETFs

    Parameters
    ----------
    returns : pandas.DataFrame, optional
        Daily log returns, one column per asset. Defaults to the module-level
        log_returns_df. Fix: the original default bound log_returns_df eagerly
        at definition time; it is now resolved at call time.

    Output
    -------
    Outputs the annualised volatility and returns as lists of floats (for use in
    assigning node colours and sizes) and also as lists of formatted strings
    (one decimal place, despite the *_2dp names) to be used in the tooltips.
    """
    if returns is None:
        returns = log_returns_df
    # log returns are additive; 252 trading days per year, expressed in percent
    annualized_returns = list(returns.mean() * 252 * 100)
    # daily population std (ddof=0, NaNs skipped — matches the original
    # np.std-on-Series behaviour) in percent, annualised by sqrt(252)
    annualized_volatility = [(returns[col] * 100).std(ddof=0) * (252 ** 0.5)
                             for col in returns.columns]
    # create display strings for the tooltip
    annualized_volatility_2dp = ["Annualized Volatility: ""%.1f" % r + "%" for r in annualized_volatility]
    annualized_returns_2dp = ["Annualized Returns: ""%.1f" % r + "%" for r in annualized_returns]
    return annualized_volatility, annualized_returns, annualized_volatility_2dp, annualized_returns_2dp
def get_top_and_bottom_three(df=None):
    """
    Get the top 3 and bottom 3 most/least correlated assets for each node.

    Parameters -> df - pandas correlation matrix. Defaults to the module-level
                       correlation_matrix; fix: the default is now resolved at
                       call time instead of being bound at definition time.
    Returns -> top_3_list : list of tooltip strings with the top 3 correlations
               bottom_3_list: list of tooltip strings with the bottom 3 correlations
    """
    if df is None:
        df = correlation_matrix
    top_3_list = []
    bottom_3_list = []
    for col in df.columns:
        # rank row positions by |correlation| once (the original ran argsort twice)
        order = np.argsort(abs(df[col]))
        # exclude the last entry (self correlation = 1) and reverse so the
        # strongest correlation comes first
        top_3 = list(order[-4:-1][::-1])
        # bottom 3 list is already in ascending (weakest-first) order
        bottom_3 = list(order[:3])
        # get column index
        col_index = df.columns.get_loc(col)
        # format "TICKER: value" strings from the row positions
        top_3_values = [df.index[x] + ": %.2f" %
                        df.iloc[x, col_index] for x in top_3]
        bottom_3_values = [df.index[x] + ": %.2f" %
                           df.iloc[x, col_index] for x in bottom_3]
        top_3_list.append(convert_rankings_to_string(top_3_values))
        bottom_3_list.append(convert_rankings_to_string(bottom_3_values))
    return top_3_list, bottom_3_list
def get_coordinates(G=None):
    """Returns the positions of nodes and edges in a format for Plotly to draw the network.

    Fix: the original accepted a graph parameter G but ignored it and always laid
    out the module-level `mst`. The supplied graph is now used; calling with no
    argument still falls back to `mst` (resolved at call time).
    """
    if G is None:
        G = mst
    # get list of node positions from a force-directed layout
    pos = nx.fruchterman_reingold_layout(G)
    Xnodes = [pos[n][0] for n in G.nodes()]
    Ynodes = [pos[n][1] for n in G.nodes()]
    Xedges = []
    Yedges = []
    for e in G.edges():
        # x (and y) coordinates of the two nodes defining edge e;
        # the trailing None breaks the polyline between edges
        Xedges.extend([pos[e[0]][0], pos[e[1]][0], None])
        Yedges.extend([pos[e[0]][1], pos[e[1]][1], None])
    return Xnodes, Ynodes, Xedges, Yedges
# ---------------------------------------
# Get statistics for tooltip
# ---------------------------------------
# make list of node labels.
node_label = list(mst.nodes())
# calculate annualised returns, annualised volatility and round to 2dp
annual_vol, annual_ret, annual_vol_2dp, annual_ret_2dp = calculate_stats()
# get top and bottom 3 correlations for each node
top_3_corrs, bottom_3_corrs = get_top_and_bottom_three()
# create tooltip string by concatenating statistics
# NOTE(review): the "<br>Weakest correlations with: " line has no trailing '+';
# it relies on implicit adjacent string-literal concatenation with the next
# "<br>" literal — legal Python, but easy to misread as a bug.
description = [f"<b>{node}</b>" +
"<br>" + annual_ret_2dp[index] +
"<br>" + annual_vol_2dp[index] +
"<br><br>Strongest correlations with: " +
"<br>" + top_3_corrs[index] +
"<br>Weakest correlations with: "
"<br>" + bottom_3_corrs[index]
for index, node in enumerate(node_label)]
# ---------------------------------------
# Get positions of nodes and edges for Plotly graph
# ---------------------------------------
Xnodes, Ynodes, Xedges, Yedges = get_coordinates()
# ---------------------------------------
# Assign node colour and size depending on annualised returns
# ---------------------------------------
# green for positive annualised returns, red for negative
node_colour = [assign_colour(ret) for ret in annual_ret]
# node size grows with sqrt(|annualised return|), scaled to keep sizes readable
node_size = [5 * abs(ret) ** 0.5 for ret in annual_ret]
# ---------------------------------------
# Plot graph
# ---------------------------------------
# edges: one continuous scatter trace; None entries in Xedges/Yedges break the line
tracer = go.Scatter(x=Xedges, y=Yedges,
mode='lines',
line= dict(color='#DCDCDC', width=1),
hoverinfo='none',
showlegend=False)
# nodes: markers carry the per-node colour/size, hovertext shows the tooltip strings
tracer_marker = go.Scatter(x=Xnodes, y=Ynodes,
mode='markers+text',
textposition='top center',
marker=dict(size=node_size,
line=dict(width=1),
color=node_colour),
hoverinfo='text',
hovertext=description,
text=node_label,
textfont=dict(size=7),
showlegend=False)
# hide both axes entirely — only the node/edge geometry matters, not coordinates
axis_style = dict(title='',
titlefont=dict(size=20),
showgrid=False,
zeroline=False,
showline=False,
ticks='',
showticklabels=False)
layout = dict(title='Plotly - interactive minimum spanning tree',
width=800,
height=800,
autosize=False,
showlegend=False,
xaxis=axis_style,
yaxis=axis_style,
hovermode='closest',
plot_bgcolor = '#fff')
fig = dict(data=[tracer, tracer_marker], layout=layout)
display(HTML("""<p>Node sizes are proportional to the size of annualised returns.<br>
Node colours signify positive or negative returns since beginning of the timeframe.</p> """))
iplot(fig)
Node sizes are proportional to the size of annualised returns.
Node colours signify positive or negative returns since beginning of the timeframe.
import time
# starting time
start = time.time()
#create undirected graph with weights corresponding to the correlation magnitude
G0 = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
#print out the graph info
#check number of nodes and degrees are as expected (complete graph on 504 nodes,
#so every node should have degree 503, i.e. average degree = 503)
# NOTE(review): nx.info() was removed in networkx 3.0 — confirm networkx < 3.0 is pinned
print(nx.info(G0))
Name: Type: Graph Number of nodes: 504 Number of edges: 126756 Average degree: 503.0000
# map each ticker label to a consecutive integer id — the sampler below requires
# nodes labelled 0..n-1. enumerate generalises the original hard-coded range(0, 504)
# so the cell keeps working if the asset universe changes size.
mapping = {node: index for index, node in enumerate(G0)}
G1 = nx.relabel_nodes(G0, mapping)
G1.nodes()
NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 
420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503))
# sanity check: the relabelled node ids must form the contiguous range 0..n-1
numeric_indices = list(range(G1.number_of_nodes()))
node_indices = sorted(G1.nodes())
assert numeric_indices == node_indices
from littleballoffur import PageRankBasedSampler
# keep 10% of the nodes; PageRankBasedSampler (third-party littleballoffur)
# samples nodes with probability proportional to their PageRank score
number_of_nodes = int(0.1*G1.number_of_nodes())
sampler = PageRankBasedSampler(number_of_nodes = number_of_nodes)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 50 Number of edges: 1225 Average degree: 49.0000
# compare the global clustering coefficient before and after node sampling
transitivity = nx.transitivity(G0)
transitivity_G1 = nx.transitivity(G1)
print(f"Transitivity Original: {transitivity:.9f}")
print(f"Transitivity Sampled: {transitivity_G1:.9f}")
Transitivity Original: 1.000000000 Transitivity Sampled: 1.000000000
fig, ax = plt.subplots(nrows=2, ncols=2,figsize=(20,20))
import string
# Fix: the original `dict(zip(G1, G0.nodes()))` paired the sampled node ids with
# the FIRST 50 tickers of G0 in insertion order, attaching wrong ticker labels.
# Node id n was assigned from position n of G0.nodes(), so invert that positional
# mapping to restore each sampled node's own ticker.
original_labels = list(G0.nodes())
mapping = {n: original_labels[n] for n in G1.nodes()}
G1 = nx.relabel_nodes(G1, mapping)
# draw the sampled graph with the same four layouts used for the full graph
layout_specs = [
    (nx.circular_layout, "Circular layout", ax[0, 0]),
    (nx.random_layout, "Random layout", ax[0, 1]),
    (nx.spring_layout, "Spring layout", ax[1, 0]),
    (nx.spectral_layout, "Spectral layout", ax[1, 1]),
]
for layout_fn, title, axis in layout_specs:
    nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
            edge_color='#363847', pos=layout_fn(G1), ax=axis)
    axis.set_title(title)
plt.show()
# 'winner takes all' method - set a minimum correlation threshold and drop
# every edge whose |correlation| falls below it
threshold = 0.2
# rebuild the fully-connected graph from the edge list
Gx = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# collect the edges that are too weakly correlated to keep
remove = [(asset_1, asset_2)
          for asset_1, asset_2, attrs in Gx.edges(data=True)
          if abs(attrs['correlation']) < threshold]
# remove them in one call and report how many were dropped
Gx.remove_edges_from(remove)
print(f"{len(remove)} edges removed")
print(nx.info(Gx))
46497 edges removed Name: Type: Graph Number of nodes: 504 Number of edges: 80259 Average degree: 318.4881
# map each ticker label to a consecutive integer id for the sampler; enumerate
# generalises the original hard-coded range(0, 504) to any node count
mapping = {node: index for index, node in enumerate(Gx)}
Gy = nx.relabel_nodes(Gx, mapping)
Gy.nodes()
NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 
420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503))
# sanity check: relabelled node ids must be exactly 0..n-1
numeric_indices = list(range(Gy.number_of_nodes()))
node_indices = sorted(Gy.nodes())
assert numeric_indices == node_indices
from littleballoffur import PageRankBasedSampler
# keep 10% of the nodes of the thresholded graph; PageRankBasedSampler
# (third-party littleballoffur) samples nodes by PageRank score
number_of_nodes = int(0.1*Gy.number_of_nodes())
sampler = PageRankBasedSampler(number_of_nodes = number_of_nodes)
Gy = sampler.sample(Gy)
print(nx.info(Gy))
Name: Type: Graph Number of nodes: 50 Number of edges: 854 Average degree: 34.1600
# compare the global clustering coefficient of the thresholded graph and its sample
transitivity = nx.transitivity(Gx)
transitivity_Gy = nx.transitivity(Gy)
print(f"Transitivity Original: {transitivity:.9f}")
print(f"Transitivity Sampled: {transitivity_Gy:.9f}")
Transitivity Original: 0.799585863 Transitivity Sampled: 0.830845279
def assign_colour(correlation):
    """Return green (#9eccb7) for positive correlations, red (#ffa09b) otherwise."""
    if correlation > 0:
        return "#9eccb7"  # green
    return "#ffa09b"  # red
def assign_thickness(correlation, benchmark_thickness=2, scaling_factor=3):
    """Edge width: benchmark width times |correlation| raised to scaling_factor."""
    return benchmark_thickness * (abs(correlation) ** scaling_factor)
def assign_node_size(degree, scaling_factor=50):
    """Node drawing size grows linearly with the node's degree."""
    return scaling_factor * degree
# edge colour encodes correlation sign; edge width encodes correlation magnitude
correlations = nx.get_edge_attributes(Gy, 'correlation')
edge_colours = [assign_colour(corr) for corr in correlations.values()]
edge_width = [assign_thickness(corr) for corr in correlations.values()]
# node size encodes the node's number of connections (degree)
node_size = [assign_node_size(degree) for _, degree in Gy.degree]
import string
# Fix: the original `dict(zip(Gy, Gx.nodes()))` paired sampled node ids with the
# FIRST 50 tickers of Gx in insertion order, attaching wrong ticker labels (the
# old comment "characters a through z" was also wrong). Node id n came from
# position n of Gx.nodes(), so invert that positional mapping instead.
original_labels = list(Gx.nodes())
mapping = {n: original_labels[n] for n in Gy.nodes()}
Gy = nx.relabel_nodes(Gy, mapping)
Gy.nodes()
# draw improved graph
# figure size and title font shared by the following network plots
sns.set(rc={'figure.figsize': (9, 9)})
font_dict = {'fontsize': 18}
# circular layout of the sampled graph: edge colour = correlation sign,
# edge width = |correlation|, node size = degree
nx.draw(Gy, pos=nx.circular_layout(Gy), with_labels=True,
node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
width=edge_width)
plt.title("Asset price correlations", fontdict=font_dict)
plt.show()
# draw improved graph
# force-directed (Fruchterman-Reingold) layout of the sampled graph
nx.draw(Gy, pos=nx.fruchterman_reingold_layout(Gy), with_labels=True,
node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
width = edge_width)
plt.title("Asset price correlations - Fruchterman-Reingold layout",fontdict=font_dict)
plt.show()
# build the minimum spanning tree of the sampled graph Gy
# (the original comment said Gx, but the code operates on Gy)
mst = nx.minimum_spanning_tree(Gy)
# colour each surviving MST edge by the sign of its correlation
edge_colours = [assign_colour(corr)
                for corr in nx.get_edge_attributes(mst, 'correlation').values()]
# draw the MST with constant node size and edge width
nx.draw(mst, with_labels=True, pos=nx.fruchterman_reingold_layout(mst),
        node_size=200, node_color="#e1575c", edge_color=edge_colours,
        width=1.2)
# set title
plt.title("Asset price correlations - Minimum Spanning Tree",fontdict=font_dict)
plt.show()
def convert_rankings_to_string(ranking):
    """
    Concatenates a list of "node: correlation" strings into a single string,
    the preferred format for the plotly tooltip.
    Inserts html "<br>" after every item in order to add a new line in the tooltip.
    """
    # join is O(total length); the original repeated += rebuilt the string each time
    return "<br>".join(ranking) + "<br>" if ranking else ""
def calculate_stats(returns=None):
    """calculate annualised returns and volatility for all ETFs

    Parameters
    ----------
    returns : pandas.DataFrame, optional
        Daily log returns, one column per asset. Defaults to the module-level
        log_returns_df. Fix: the original default bound log_returns_df eagerly
        at definition time; it is now resolved at call time.

    Output
    -------
    Outputs the annualised volatility and returns as lists of floats (for use in
    assigning node colours and sizes) and also as lists of formatted strings
    (one decimal place, despite the *_2dp names) to be used in the tooltips.
    """
    if returns is None:
        returns = log_returns_df
    # log returns are additive; 252 trading days per year, expressed in percent
    annualized_returns = list(returns.mean() * 252 * 100)
    # daily population std (ddof=0, NaNs skipped — matches the original
    # np.std-on-Series behaviour) in percent, annualised by sqrt(252)
    annualized_volatility = [(returns[col] * 100).std(ddof=0) * (252 ** 0.5)
                             for col in returns.columns]
    # create display strings for the tooltip
    annualized_volatility_2dp = ["Annualized Volatility: ""%.1f" % r + "%" for r in annualized_volatility]
    annualized_returns_2dp = ["Annualized Returns: ""%.1f" % r + "%" for r in annualized_returns]
    return annualized_volatility, annualized_returns, annualized_volatility_2dp, annualized_returns_2dp
def get_top_and_bottom_three(df=None):
    """
    Get the top 3 and bottom 3 most/least correlated assets for each node.

    Parameters -> df - pandas correlation matrix. Defaults to the module-level
                       correlation_matrix; fix: the default is now resolved at
                       call time instead of being bound at definition time.
    Returns -> top_3_list : list of tooltip strings with the top 3 correlations
               bottom_3_list: list of tooltip strings with the bottom 3 correlations
    """
    if df is None:
        df = correlation_matrix
    top_3_list = []
    bottom_3_list = []
    for col in df.columns:
        # rank row positions by |correlation| once (the original ran argsort twice)
        order = np.argsort(abs(df[col]))
        # exclude the last entry (self correlation = 1) and reverse so the
        # strongest correlation comes first
        top_3 = list(order[-4:-1][::-1])
        # bottom 3 list is already in ascending (weakest-first) order
        bottom_3 = list(order[:3])
        # get column index
        col_index = df.columns.get_loc(col)
        # format "TICKER: value" strings from the row positions
        top_3_values = [df.index[x] + ": %.2f" %
                        df.iloc[x, col_index] for x in top_3]
        bottom_3_values = [df.index[x] + ": %.2f" %
                           df.iloc[x, col_index] for x in bottom_3]
        top_3_list.append(convert_rankings_to_string(top_3_values))
        bottom_3_list.append(convert_rankings_to_string(bottom_3_values))
    return top_3_list, bottom_3_list
def get_coordinates(G=None):
    """Returns the positions of nodes and edges in a format for Plotly to draw the network.

    Fix: the original accepted a graph parameter G but ignored it and always laid
    out the module-level `mst`. The supplied graph is now used; calling with no
    argument still falls back to `mst` (resolved at call time).
    """
    if G is None:
        G = mst
    # get list of node positions from a force-directed layout
    pos = nx.fruchterman_reingold_layout(G)
    Xnodes = [pos[n][0] for n in G.nodes()]
    Ynodes = [pos[n][1] for n in G.nodes()]
    Xedges = []
    Yedges = []
    for e in G.edges():
        # x (and y) coordinates of the two nodes defining edge e;
        # the trailing None breaks the polyline between edges
        Xedges.extend([pos[e[0]][0], pos[e[1]][0], None])
        Yedges.extend([pos[e[0]][1], pos[e[1]][1], None])
    return Xnodes, Ynodes, Xedges, Yedges
# ---------------------------------------
# Get statistics for tooltip
# ---------------------------------------
# make list of node labels.
node_label = list(mst.nodes())
# calculate annualised returns, annualised volatility and round to 2dp
annual_vol, annual_ret, annual_vol_2dp, annual_ret_2dp = calculate_stats()
# get top and bottom 3 correlations for each node
top_3_corrs, bottom_3_corrs = get_top_and_bottom_three()
# create tooltip string by concatenating statistics
# NOTE(review): the "<br>Weakest correlations with: " line has no trailing '+';
# it relies on implicit adjacent string-literal concatenation with the next
# "<br>" literal — legal Python, but easy to misread as a bug.
description = [f"<b>{node}</b>" +
"<br>" + annual_ret_2dp[index] +
"<br>" + annual_vol_2dp[index] +
"<br><br>Strongest correlations with: " +
"<br>" + top_3_corrs[index] +
"<br>Weakest correlations with: "
"<br>" + bottom_3_corrs[index]
for index, node in enumerate(node_label)]
# ---------------------------------------
# Get positions of nodes and edges for Plotly graph
# ---------------------------------------
Xnodes, Ynodes, Xedges, Yedges = get_coordinates()
# ---------------------------------------
# Assign node colour and size depending on annualised returns
# ---------------------------------------
# green for positive annualised returns, red for negative
node_colour = [assign_colour(ret) for ret in annual_ret]
# node size grows with sqrt(|annualised return|), scaled to keep sizes readable
node_size = [5 * abs(ret) ** 0.5 for ret in annual_ret]
# ---------------------------------------
# Plot graph
# ---------------------------------------
# edges: one continuous scatter trace; None entries in Xedges/Yedges break the line
tracer = go.Scatter(x=Xedges, y=Yedges,
mode='lines',
line= dict(color='#DCDCDC', width=1),
hoverinfo='none',
showlegend=False)
# nodes: markers carry the per-node colour/size, hovertext shows the tooltip strings
tracer_marker = go.Scatter(x=Xnodes, y=Ynodes,
mode='markers+text',
textposition='top center',
marker=dict(size=node_size,
line=dict(width=1),
color=node_colour),
hoverinfo='text',
hovertext=description,
text=node_label,
textfont=dict(size=7),
showlegend=False)
# hide both axes entirely — only the node/edge geometry matters, not coordinates
axis_style = dict(title='',
titlefont=dict(size=20),
showgrid=False,
zeroline=False,
showline=False,
ticks='',
showticklabels=False)
layout = dict(title='Plotly - interactive minimum spanning tree',
width=800,
height=800,
autosize=False,
showlegend=False,
xaxis=axis_style,
yaxis=axis_style,
hovermode='closest',
plot_bgcolor = '#fff')
fig = dict(data=[tracer, tracer_marker], layout=layout)
display(HTML("""<p>Node sizes are proportional to the size of annualised returns.<br>
Node colours signify positive or negative returns since beginning of the timeframe.</p> """))
iplot(fig)
Node sizes are proportional to the size of annualised returns.
Node colours signify positive or negative returns since beginning of the timeframe.
# end time
end = time.time()
# total wall-clock time for this sampling + plotting experiment
elapsed_seconds = end - start
print(f"Runtime of the program is {elapsed_seconds}")
Runtime of the program is 23.665722131729126
import time
# starting time
start = time.time()
#create undirected graph with weights corresponding to the correlation magnitude
G0 = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
#print out the graph info
#check number of nodes and degrees are as expected (complete graph on 504 nodes,
#so every node should have degree 503, i.e. average degree = 503)
# NOTE(review): nx.info() was removed in networkx 3.0 — confirm networkx < 3.0 is pinned
print(nx.info(G0))
Name: Type: Graph Number of nodes: 504 Number of edges: 126756 Average degree: 503.0000
# relabel ticker-named nodes to consecutive integers 0..503
# (littleballoffur samplers appear to require integer node ids — the assert
# below verifies the relabelling produced exactly 0..n-1)
mapping = dict(zip(G0, range(0, 504)))
G1 = nx.relabel_nodes(G0, mapping)
numeric_indices = [index for index in range(G1.number_of_nodes())]
node_indices = sorted([node for node in G1.nodes()])
assert numeric_indices == node_indices
from littleballoffur import HybridNodeEdgeSampler
# keep 10% of the edges
number_of_edges = int(0.1*G1.number_of_edges())
sampler = HybridNodeEdgeSampler(number_of_edges = number_of_edges)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 504 Number of edges: 12675 Average degree: 50.2976
# compare clustering structure of the full graph vs the sampled graph
transitivity = nx.transitivity(G0)
transitivity_G1 = nx.transitivity(G1)
print('Transitivity Original: {:.9f}'.format(transitivity))
print('Transitivity Sampled: {:.9f}'.format(transitivity_G1))
Transitivity Original: 1.000000000 Transitivity Sampled: 0.099526849
# draw the sampled graph with four different layout algorithms, 2x2 grid
fig, ax = plt.subplots(nrows=2, ncols=2,figsize=(20,20))
import string
# map integer node labels back to the original ticker symbols
mapping = dict(zip(G1, G0.nodes()))
G1 = nx.relabel_nodes(G1, mapping)
G1.nodes()
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.circular_layout(G1),ax=ax[0,0])
ax[0,0].set_title("Circular layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.random_layout(G1),ax=ax[0,1])
ax[0,1].set_title("Random layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.spring_layout(G1),ax=ax[1,0])
ax[1,0].set_title("Spring layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.spectral_layout(G1),ax=ax[1,1])
ax[1,1].set_title("Spectral layout")
plt.show()
# 'winner takes all' method - set minimum correlation threshold to remove some edges from the diagram
threshold = 0.2
# create a new graph from edge list
Gx = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# list to store edges to remove
remove = []
# loop through edges in Gx and find correlations which are below the threshold
for asset_1, asset_2 in Gx.edges():
    corr = Gx[asset_1][asset_2]['correlation']
    # add to remove list if abs(corr) < threshold
    if abs(corr) < threshold:
        remove.append((asset_1, asset_2))
# remove edges contained in the remove list
Gx.remove_edges_from(remove)
print(str(len(remove)) + " edges removed")
print(nx.info(Gx))
46497 edges removed Name: Type: Graph Number of nodes: 504 Number of edges: 80259 Average degree: 318.4881
# relabel the thresholded graph to integer ids 0..503 for the sampler
mapping = dict(zip(Gx, range(0, 504)))
Gy = nx.relabel_nodes(Gx, mapping)
numeric_indices = [index for index in range(Gy.number_of_nodes())]
node_indices = sorted([node for node in Gy.nodes()])
assert numeric_indices == node_indices
from littleballoffur import HybridNodeEdgeSampler
# keep 10% of the edges
number_of_edges = int(0.1*Gy.number_of_edges())
sampler = HybridNodeEdgeSampler(number_of_edges = number_of_edges)
Gy = sampler.sample(Gy)
print(nx.info(Gy))
Name: Type: Graph Number of nodes: 504 Number of edges: 8025 Average degree: 31.8452
# compare clustering structure before and after sampling the thresholded graph
transitivity = nx.transitivity(Gx)
transitivity_Gy = nx.transitivity(Gy)
print('Transitivity Original: {:.9f}'.format(transitivity))
print('Transitivity Sampled: {:.9f}'.format(transitivity_Gy))
Transitivity Original: 0.799585863 Transitivity Sampled: 0.072984061
def assign_colour(correlation):
    """Return a hex colour: green for positive values, red for zero/negative."""
    return "#9eccb7" if correlation > 0 else "#ffa09b"  # green / red
def assign_thickness(correlation, benchmark_thickness=2, scaling_factor=3):
    """Edge width: base thickness scaled by |correlation| raised to scaling_factor."""
    magnitude = abs(correlation)
    return benchmark_thickness * magnitude ** scaling_factor
def assign_node_size(degree, scaling_factor=50):
    """Node size proportional to the node's degree."""
    return scaling_factor * degree
# assign colours to edges depending on positive or negative correlation
# assign edge thickness depending on magnitude of correlation
edge_colours = []
edge_width = []
for key, value in nx.get_edge_attributes(Gy, 'correlation').items():
    edge_colours.append(assign_colour(value))
    edge_width.append(assign_thickness(value))
# assign node size depending on number of connections (degree)
node_size = []
for key, value in dict(Gy.degree).items():
    node_size.append(assign_node_size(value))
import string
# map integer node labels back to the original ticker symbols
mapping = dict(zip(Gy, Gx.nodes()))
Gy = nx.relabel_nodes(Gy, mapping)
Gy.nodes()
# draw improved graph (circular layout)
sns.set(rc={'figure.figsize': (9, 9)})
font_dict = {'fontsize': 18}
nx.draw(Gy, pos=nx.circular_layout(Gy), with_labels=True,
        node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
        width=edge_width)
plt.title("Asset price correlations", fontdict=font_dict)
plt.show()
# draw improved graph (force-directed layout)
nx.draw(Gy, pos=nx.fruchterman_reingold_layout(Gy), with_labels=True,
        node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
        width = edge_width)
plt.title("Asset price correlations - Fruchterman-Reingold layout",fontdict=font_dict)
plt.show()
# create minimum spanning tree from Gy (after small correlations have been removed)
mst = nx.minimum_spanning_tree(Gy)
edge_colours = []
# assign edge colours
for key, value in nx.get_edge_attributes(mst, 'correlation').items():
    edge_colours.append(assign_colour(value))
# draw minimum spanning tree. Set node size and width to constant
nx.draw(mst, with_labels=True, pos=nx.fruchterman_reingold_layout(mst),
        node_size=200, node_color="#e1575c", edge_color=edge_colours,
        width = 1.2)
# set title
plt.title("Asset price correlations - Minimum Spanning Tree",fontdict=font_dict)
plt.show()
def convert_rankings_to_string(ranking):
    """
    Join the ranking entries into a single string, the format preferred by
    the plotly tooltip. An html "<br>" follows each item (including the
    last) so every entry renders on its own line.
    """
    return "".join(item + "<br>" for item in ranking)
def calculate_stats(returns=log_returns_df):
"""calculate annualised returns and volatility for all ETFs
Output
-------
Outputs the annualised volatility and returns as a list of floats (for use in assigning node colours
and sizes) and also as a lists of formatted strings to be used in the tool tips.
"""
#log returns are additive, 252 trading days
annualized_returns = list(np.mean(returns)*252*100)
annualized_volatility = [np.std(returns[col]*100)*(252**0.5)
for col in list(returns.columns)]
# create string for tooltip
annualized_volatility_2dp = ["Annualized Volatility: ""%.1f" % r + "%" for r in annualized_volatility]
annualized_returns_2dp = ["Annualized Returns: ""%.1f" % r + "%" for r in annualized_returns]
return annualized_volatility, annualized_returns, annualized_volatility_2dp, annualized_returns_2dp
def get_top_and_bottom_three(df=None):
    """
    For each asset (column) find the three strongest and three weakest
    correlations (by absolute value) with the other assets.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Correlation matrix. Defaults to the notebook-level
        ``correlation_matrix``, resolved at call time rather than at
        def time (the original def-time default fails if the global is
        not yet defined and pins a stale object).

    Returns
    -------
    top_3_list : list of tooltip strings with the top 3 correlations (name and value)
    bottom_3_list : list of tooltip strings with the bottom 3 correlations
    """
    if df is None:
        df = correlation_matrix
    top_3_list = []
    bottom_3_list = []
    for col in df.columns:
        # rank positions by |correlation| once per column (ascending)
        order = np.argsort(abs(df[col]))
        # top 3: take the last three excluding the final entry (the
        # self-correlation of 1.0), reversed so strongest comes first
        top_3 = list(order[-4:-1][::-1])
        # bottom 3 list is already in the correct (ascending) order
        bottom_3 = list(order[:3])
        # get column index
        col_index = df.columns.get_loc(col)
        # format "NAME: value" using the index locations
        top_3_values = [df.index[x] + ": %.2f" %
                        df.iloc[x, col_index] for x in top_3]
        bottom_3_values = [df.index[x] + ": %.2f" %
                           df.iloc[x, col_index] for x in bottom_3]
        top_3_list.append(convert_rankings_to_string(top_3_values))
        bottom_3_list.append(convert_rankings_to_string(bottom_3_values))
    return top_3_list, bottom_3_list
def get_coordinates(G=None):
    """Return node and edge positions in a format for Plotly to draw the network.

    Parameters
    ----------
    G : networkx.Graph, optional
        Graph to lay out; defaults to the notebook-level ``mst``.
        Bug fix: the original ignored this parameter and always laid out
        the global ``mst``; it also bound the global at def time.

    Returns
    -------
    Xnodes, Ynodes : lists of node x/y coordinates
    Xedges, Yedges : per-edge endpoint coordinates with a None after each
        edge so Plotly draws the edges as disconnected segments
    """
    if G is None:
        G = mst
    # force-directed (Fruchterman-Reingold) node positions
    pos = nx.fruchterman_reingold_layout(G)
    Xnodes = [pos[n][0] for n in G.nodes()]
    Ynodes = [pos[n][1] for n in G.nodes()]
    Xedges = []
    Yedges = []
    for e in G.edges():
        # coordinates of the two nodes defining edge e; None breaks the line
        Xedges.extend([pos[e[0]][0], pos[e[1]][0], None])
        Yedges.extend([pos[e[0]][1], pos[e[1]][1], None])
    return Xnodes, Ynodes, Xedges, Yedges
# ---------------------------------------
# Get statistics for tooltip
# ---------------------------------------
# make list of node labels.
node_label = list(mst.nodes())
# calculate annualised returns, annualised volatility and formatted strings
annual_vol, annual_ret, annual_vol_2dp, annual_ret_2dp = calculate_stats()
# get top and bottom 3 correlations for each node
top_3_corrs, bottom_3_corrs = get_top_and_bottom_three()
# create tooltip string by concatenating statistics
# (the "Weakest..." literal joins the following "<br>" by implicit
# string-literal concatenation - no "+" needed)
description = [f"<b>{node}</b>" +
               "<br>" + annual_ret_2dp[index] +
               "<br>" + annual_vol_2dp[index] +
               "<br><br>Strongest correlations with: " +
               "<br>" + top_3_corrs[index] +
               "<br>Weakest correlations with: "
               "<br>" + bottom_3_corrs[index]
               for index, node in enumerate(node_label)]
# ---------------------------------------
# Get positions of nodes and edges for Plotly graph
# ---------------------------------------
# get coordinates for nodes and edges
Xnodes, Ynodes, Xedges, Yedges = get_coordinates()
# ---------------------------------------
# Assign node colour and size depending on annualised returns
# ---------------------------------------
# assign node colour depending on positive or negative annualised returns
node_colour = [assign_colour(i) for i in annual_ret]
# assign node size based on annualised returns size (scaled by a factor)
node_size = [abs(x)**0.5*5 for x in annual_ret]
# ---------------------------------------
# Plot graph
# ---------------------------------------
# edges: one grey line trace for all edges (None separators break segments)
tracer = go.Scatter(x=Xedges, y=Yedges,
                    mode='lines',
                    line= dict(color='#DCDCDC', width=1),
                    hoverinfo='none',
                    showlegend=False)
# nodes: markers with ticker labels and per-node hover tooltips
tracer_marker = go.Scatter(x=Xnodes, y=Ynodes,
                           mode='markers+text',
                           textposition='top center',
                           marker=dict(size=node_size,
                                       line=dict(width=1),
                                       color=node_colour),
                           hoverinfo='text',
                           hovertext=description,
                           text=node_label,
                           textfont=dict(size=7),
                           showlegend=False)
# hide axis lines, grid, ticks and tick labels on both axes
axis_style = dict(title='',
                  titlefont=dict(size=20),
                  showgrid=False,
                  zeroline=False,
                  showline=False,
                  ticks='',
                  showticklabels=False)
layout = dict(title='Plotly - interactive minimum spanning tree',
              width=800,
              height=800,
              autosize=False,
              showlegend=False,
              xaxis=axis_style,
              yaxis=axis_style,
              hovermode='closest',
              plot_bgcolor = '#fff')
fig = dict(data=[tracer, tracer_marker], layout=layout)
display(HTML("""<p>Node sizes are proportional to the size of annualised returns.<br>
Node colours signify positive or negative returns since beginning of the timeframe.</p> """))
iplot(fig)
Node sizes are proportional to the size of annualised returns.
Node colours signify positive or negative returns since beginning of the timeframe.
# end time
end = time.time()
# total time taken (seconds since `start` was recorded earlier in the notebook)
print(f"Runtime of the program is {end - start}")
Runtime of the program is 33.38518309593201
import time
# starting time
start = time.time()
# create undirected graph with weights corresponding to the correlation magnitude
G0 = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# print out the graph info
# check number of nodes and degrees are as expected: the edge list is a full
# correlation matrix, so the graph is complete (504 nodes, average degree 503)
print(nx.info(G0))
Name: Type: Graph Number of nodes: 504 Number of edges: 126756 Average degree: 503.0000
# relabel ticker-named nodes to consecutive integers 0..503 for the sampler
# (the assert below verifies the relabelling produced exactly 0..n-1)
mapping = dict(zip(G0, range(0, 504)))
G1 = nx.relabel_nodes(G0, mapping)
numeric_indices = [index for index in range(G1.number_of_nodes())]
node_indices = sorted([node for node in G1.nodes()])
assert numeric_indices == node_indices
from littleballoffur import MetropolisHastingsRandomWalkSampler
# keep 50% of the nodes
number_of_nodes = int(0.5*G1.number_of_nodes())
sampler = MetropolisHastingsRandomWalkSampler(number_of_nodes = number_of_nodes)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 252 Number of edges: 31626 Average degree: 251.0000
# compare clustering structure of the full graph vs the sampled graph
transitivity = nx.transitivity(G0)
transitivity_G1 = nx.transitivity(G1)
print('Transitivity Original: {:.9f}'.format(transitivity))
print('Transitivity Sampled: {:.9f}'.format(transitivity_G1))
Transitivity Original: 1.000000000 Transitivity Sampled: 1.000000000
# draw the sampled graph with four different layout algorithms, 2x2 grid
fig, ax = plt.subplots(nrows=2, ncols=2,figsize=(20,20))
import string
# map integer node labels back to the original ticker symbols
mapping = dict(zip(G1, G0.nodes()))
G1 = nx.relabel_nodes(G1, mapping)
G1.nodes()
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.circular_layout(G1),ax=ax[0,0])
ax[0,0].set_title("Circular layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.random_layout(G1),ax=ax[0,1])
ax[0,1].set_title("Random layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.spring_layout(G1),ax=ax[1,0])
ax[1,0].set_title("Spring layout")
nx.draw(G1, with_labels=True, node_size=700, node_color="#e1575c",
        edge_color='#363847', pos=nx.spectral_layout(G1),ax=ax[1,1])
ax[1,1].set_title("Spectral layout")
plt.show()
# 'winner takes all' method - set minimum correlation threshold to remove some edges from the diagram
threshold = 0.2
# create a new graph from edge list
Gx = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
# list to store edges to remove
remove = []
# loop through edges in Gx and find correlations which are below the threshold
for asset_1, asset_2 in Gx.edges():
    corr = Gx[asset_1][asset_2]['correlation']
    # add to remove list if abs(corr) < threshold
    if abs(corr) < threshold:
        remove.append((asset_1, asset_2))
# remove edges contained in the remove list
Gx.remove_edges_from(remove)
print(str(len(remove)) + " edges removed")
print(nx.info(Gx))
46497 edges removed Name: Type: Graph Number of nodes: 504 Number of edges: 80259 Average degree: 318.4881
# relabel the thresholded graph to integer ids 0..503 for the sampler
mapping = dict(zip(Gx, range(0, 504)))
Gy = nx.relabel_nodes(Gx, mapping)
numeric_indices = [index for index in range(Gy.number_of_nodes())]
node_indices = sorted([node for node in Gy.nodes()])
assert numeric_indices == node_indices
from littleballoffur import MetropolisHastingsRandomWalkSampler
# keep 50% of the nodes
number_of_nodes = int(0.5*Gy.number_of_nodes())
sampler = MetropolisHastingsRandomWalkSampler(number_of_nodes = number_of_nodes)
Gy = sampler.sample(Gy)
print(nx.info(Gy))
Name: Type: Graph Number of nodes: 252 Number of edges: 22271 Average degree: 176.7540
# compare clustering structure before and after sampling the thresholded graph
transitivity = nx.transitivity(Gx)
transitivity_Gy = nx.transitivity(Gy)
print('Transitivity Original: {:.9f}'.format(transitivity))
print('Transitivity Sampled: {:.9f}'.format(transitivity_Gy))
Transitivity Original: 0.799585863 Transitivity Sampled: 0.842046607
def assign_colour(correlation):
    """Return a hex colour: green for positive values, red for zero/negative."""
    return "#9eccb7" if correlation > 0 else "#ffa09b"  # green / red
def assign_thickness(correlation, benchmark_thickness=2, scaling_factor=3):
    """Edge width: base thickness scaled by |correlation| raised to scaling_factor."""
    magnitude = abs(correlation)
    return benchmark_thickness * magnitude ** scaling_factor
def assign_node_size(degree, scaling_factor=50):
    """Node size proportional to the node's degree."""
    return scaling_factor * degree
# assign colours to edges depending on positive or negative correlation
# assign edge thickness depending on magnitude of correlation
edge_colours = []
edge_width = []
for key, value in nx.get_edge_attributes(Gy, 'correlation').items():
    edge_colours.append(assign_colour(value))
    edge_width.append(assign_thickness(value))
# assign node size depending on number of connections (degree)
node_size = []
for key, value in dict(Gy.degree).items():
    node_size.append(assign_node_size(value))
import string
# map integer node labels back to the original ticker symbols
mapping = dict(zip(Gy, Gx.nodes()))
Gy = nx.relabel_nodes(Gy, mapping)
Gy.nodes()
# draw improved graph (circular layout)
sns.set(rc={'figure.figsize': (9, 9)})
font_dict = {'fontsize': 18}
nx.draw(Gy, pos=nx.circular_layout(Gy), with_labels=True,
        node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
        width=edge_width)
plt.title("Asset price correlations", fontdict=font_dict)
plt.show()
# draw improved graph (force-directed layout)
nx.draw(Gy, pos=nx.fruchterman_reingold_layout(Gy), with_labels=True,
        node_size=node_size, node_color="#e1575c", edge_color=edge_colours,
        width = edge_width)
plt.title("Asset price correlations - Fruchterman-Reingold layout",fontdict=font_dict)
plt.show()
# create minimum spanning tree from Gy (after small correlations have been removed)
mst = nx.minimum_spanning_tree(Gy)
edge_colours = []
# assign edge colours
for key, value in nx.get_edge_attributes(mst, 'correlation').items():
    edge_colours.append(assign_colour(value))
# draw minimum spanning tree. Set node size and width to constant
nx.draw(mst, with_labels=True, pos=nx.fruchterman_reingold_layout(mst),
        node_size=200, node_color="#e1575c", edge_color=edge_colours,
        width = 1.2)
# set title
plt.title("Asset price correlations - Minimum Spanning Tree",fontdict=font_dict)
plt.show()
def convert_rankings_to_string(ranking):
    """
    Join the ranking entries into a single string, the format preferred by
    the plotly tooltip. An html "<br>" follows each item (including the
    last) so every entry renders on its own line.
    """
    return "".join(item + "<br>" for item in ranking)
def calculate_stats(returns=log_returns_df):
"""calculate annualised returns and volatility for all ETFs
Output
-------
Outputs the annualised volatility and returns as a list of floats (for use in assigning node colours
and sizes) and also as a lists of formatted strings to be used in the tool tips.
"""
#log returns are additive, 252 trading days
annualized_returns = list(np.mean(returns)*252*100)
annualized_volatility = [np.std(returns[col]*100)*(252**0.5)
for col in list(returns.columns)]
# create string for tooltip
annualized_volatility_2dp = ["Annualized Volatility: ""%.1f" % r + "%" for r in annualized_volatility]
annualized_returns_2dp = ["Annualized Returns: ""%.1f" % r + "%" for r in annualized_returns]
return annualized_volatility, annualized_returns, annualized_volatility_2dp, annualized_returns_2dp
def get_top_and_bottom_three(df=None):
    """
    For each asset (column) find the three strongest and three weakest
    correlations (by absolute value) with the other assets.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Correlation matrix. Defaults to the notebook-level
        ``correlation_matrix``, resolved at call time rather than at
        def time (the original def-time default fails if the global is
        not yet defined and pins a stale object).

    Returns
    -------
    top_3_list : list of tooltip strings with the top 3 correlations (name and value)
    bottom_3_list : list of tooltip strings with the bottom 3 correlations
    """
    if df is None:
        df = correlation_matrix
    top_3_list = []
    bottom_3_list = []
    for col in df.columns:
        # rank positions by |correlation| once per column (ascending)
        order = np.argsort(abs(df[col]))
        # top 3: take the last three excluding the final entry (the
        # self-correlation of 1.0), reversed so strongest comes first
        top_3 = list(order[-4:-1][::-1])
        # bottom 3 list is already in the correct (ascending) order
        bottom_3 = list(order[:3])
        # get column index
        col_index = df.columns.get_loc(col)
        # format "NAME: value" using the index locations
        top_3_values = [df.index[x] + ": %.2f" %
                        df.iloc[x, col_index] for x in top_3]
        bottom_3_values = [df.index[x] + ": %.2f" %
                           df.iloc[x, col_index] for x in bottom_3]
        top_3_list.append(convert_rankings_to_string(top_3_values))
        bottom_3_list.append(convert_rankings_to_string(bottom_3_values))
    return top_3_list, bottom_3_list
def get_coordinates(G=None):
    """Return node and edge positions in a format for Plotly to draw the network.

    Parameters
    ----------
    G : networkx.Graph, optional
        Graph to lay out; defaults to the notebook-level ``mst``.
        Bug fix: the original ignored this parameter and always laid out
        the global ``mst``; it also bound the global at def time.

    Returns
    -------
    Xnodes, Ynodes : lists of node x/y coordinates
    Xedges, Yedges : per-edge endpoint coordinates with a None after each
        edge so Plotly draws the edges as disconnected segments
    """
    if G is None:
        G = mst
    # force-directed (Fruchterman-Reingold) node positions
    pos = nx.fruchterman_reingold_layout(G)
    Xnodes = [pos[n][0] for n in G.nodes()]
    Ynodes = [pos[n][1] for n in G.nodes()]
    Xedges = []
    Yedges = []
    for e in G.edges():
        # coordinates of the two nodes defining edge e; None breaks the line
        Xedges.extend([pos[e[0]][0], pos[e[1]][0], None])
        Yedges.extend([pos[e[0]][1], pos[e[1]][1], None])
    return Xnodes, Ynodes, Xedges, Yedges
# ---------------------------------------
# Get statistics for tooltip
# ---------------------------------------
# make list of node labels.
node_label = list(mst.nodes())
# calculate annualised returns, annualised volatility and formatted strings
annual_vol, annual_ret, annual_vol_2dp, annual_ret_2dp = calculate_stats()
# get top and bottom 3 correlations for each node
top_3_corrs, bottom_3_corrs = get_top_and_bottom_three()
# create tooltip string by concatenating statistics
# (the "Weakest..." literal joins the following "<br>" by implicit
# string-literal concatenation - no "+" needed)
description = [f"<b>{node}</b>" +
               "<br>" + annual_ret_2dp[index] +
               "<br>" + annual_vol_2dp[index] +
               "<br><br>Strongest correlations with: " +
               "<br>" + top_3_corrs[index] +
               "<br>Weakest correlations with: "
               "<br>" + bottom_3_corrs[index]
               for index, node in enumerate(node_label)]
# ---------------------------------------
# Get positions of nodes and edges for Plotly graph
# ---------------------------------------
# get coordinates for nodes and edges
Xnodes, Ynodes, Xedges, Yedges = get_coordinates()
# ---------------------------------------
# Assign node colour and size depending on annualised returns
# ---------------------------------------
# assign node colour depending on positive or negative annualised returns
node_colour = [assign_colour(i) for i in annual_ret]
# assign node size based on annualised returns size (scaled by a factor)
node_size = [abs(x)**0.5*5 for x in annual_ret]
# ---------------------------------------
# Plot graph
# ---------------------------------------
# edges: one grey line trace for all edges (None separators break segments)
tracer = go.Scatter(x=Xedges, y=Yedges,
                    mode='lines',
                    line= dict(color='#DCDCDC', width=1),
                    hoverinfo='none',
                    showlegend=False)
# nodes: markers with ticker labels and per-node hover tooltips
tracer_marker = go.Scatter(x=Xnodes, y=Ynodes,
                           mode='markers+text',
                           textposition='top center',
                           marker=dict(size=node_size,
                                       line=dict(width=1),
                                       color=node_colour),
                           hoverinfo='text',
                           hovertext=description,
                           text=node_label,
                           textfont=dict(size=7),
                           showlegend=False)
# hide axis lines, grid, ticks and tick labels on both axes
axis_style = dict(title='',
                  titlefont=dict(size=20),
                  showgrid=False,
                  zeroline=False,
                  showline=False,
                  ticks='',
                  showticklabels=False)
layout = dict(title='Plotly - interactive minimum spanning tree',
              width=800,
              height=800,
              autosize=False,
              showlegend=False,
              xaxis=axis_style,
              yaxis=axis_style,
              hovermode='closest',
              plot_bgcolor = '#fff')
fig = dict(data=[tracer, tracer_marker], layout=layout)
display(HTML("""<p>Node sizes are proportional to the size of annualised returns.<br>
Node colours signify positive or negative returns since beginning of the timeframe.</p> """))
iplot(fig)
Node sizes are proportional to the size of annualised returns.
Node colours signify positive or negative returns since beginning of the timeframe.
from littleballoffur import PageRankBasedSampler
# rebuild the integer-labelled copy of the full graph for the sampler
mapping = dict(zip(G0, range(0, 504)))
G1 = nx.relabel_nodes(G0, mapping)
# keep 10% of the nodes (sampler presumably weights nodes by PageRank
# score - see littleballoffur docs)
number_of_nodes = int(0.1*G1.number_of_nodes())
sampler = PageRankBasedSampler(number_of_nodes = number_of_nodes)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 50 Number of edges: 1225 Average degree: 49.0000
# mapping number to each ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# assume that the total money that we invest initially is 10,000 dollars
total_money = 10000
# calculate the total number of stocks that we are going to invest
stock_num = len(G1.nodes())
# calculate the money that we invest in each stock (equal weighting)
per_price = total_money / stock_num
# calculate the number of shares that we bought for each stock at the
# first-day ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the equal weight portfolio of node sampling: ', finally_price)
Final value of the equal weight portfolio of node sampling: 14145.988121660277
# variance for each stock, from daily percentage-change returns
returns = raw_asset_prices_df.pct_change()
var_dict = dict(returns.var())
# mapping number to each ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# assume that the total money that we invest initially is 10,000 dollars
total_money = 10000
# calculate the total number of stocks that we are going to invest
stock_num = len(G1.nodes())
"""
calculate the money that we invest in each stock
"""
# the sum of each variance
var_sum = 0
for i in list(G1.nodes()):
    var_sum = var_sum + var_dict[num_to_id[i]]
# allocate money to each stock in proportion to its variance
per_price = dict()
for i in list(G1.nodes()):
    per_price[num_to_id[i]] = total_money * (var_dict[num_to_id[i]]/var_sum)
# calculate the number of shares that we bought for each stock at the
# first-day ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price[num_to_id[i]] / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the variance weight portfolio of node sampling:', finally_price)
Final value of the variance weight portfolio of node sampling: 16325.806898268831
from littleballoffur import HybridNodeEdgeSampler
# rebuild the integer-labelled copy of the full graph for the sampler
mapping = dict(zip(G0, range(0, 504)))
G1 = nx.relabel_nodes(G0, mapping)
# keep 10% of the edges (nodes are retained; isolated ones keep degree 0)
number_of_edges = int(0.1*G1.number_of_edges())
sampler = HybridNodeEdgeSampler(number_of_edges = number_of_edges)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 504 Number of edges: 12675 Average degree: 50.2976
# mapping number to ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# initial value
total_money = 10000
# number of stocks
stock_num = len(G1.nodes())
# money invested in each stock (equal weighting)
per_price = total_money / stock_num
# number of shares that we bought for each stock at the first-day
# ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value of the portfolio on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the equal weight portfolio of edge sampling:', finally_price)
Final value of the equal weight portfolio of edge sampling: 14458.31207578887
# variance of each stock (uses `returns` computed earlier in the notebook)
var_dict = dict(returns.var())
# mapping number to ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# initial value
total_money = 10000
# number of stocks
stock_num = len(G1.nodes())
"""
calculate the money invested in each stock
"""
# total variance
var_sum = 0
for i in list(G1.nodes()):
    var_sum = var_sum + var_dict[num_to_id[i]]
# allocate money to each stock in proportion to its variance
per_price = dict()
for i in list(G1.nodes()):
    per_price[num_to_id[i]] = total_money * (var_dict[num_to_id[i]]/var_sum)
# number of shares of each stock at the first-day ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price[num_to_id[i]] / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the variance weight portfolio of edge sampling:', finally_price)
Final value of the variance weight portfolio of edge sampling: 15900.5416194285
from littleballoffur import MetropolisHastingsRandomWalkSampler
# rebuild the integer-labelled copy of the full graph for the sampler
mapping = dict(zip(G0, range(0, 504)))
G1 = nx.relabel_nodes(G0, mapping)
# keep 50% of the nodes
number_of_nodes = int(0.5*G1.number_of_nodes())
sampler = MetropolisHastingsRandomWalkSampler(number_of_nodes = number_of_nodes)
G1 = sampler.sample(G1)
print(nx.info(G1))
Name: Type: Graph Number of nodes: 252 Number of edges: 31626 Average degree: 251.0000
# mapping number to ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# initial value
total_money = 10000
# number of stocks
stock_num = len(G1.nodes())
# money invested in each stock (equal weighting)
per_price = total_money / stock_num
# number of shares of each stock at the first-day ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the equal weight portfolio of exploration sampling:', finally_price)
Final value of the equal weight portfolio of exploration sampling: 14649.059209291358
# variance of each stock (uses `returns` computed earlier in the notebook)
var_dict = dict(returns.var())
# mapping number to ID: invert the ticker -> integer mapping
num_to_id = dict()
for item,values in mapping.items():
    num_to_id[values] = item
# initial value
total_money = 10000
# number of stocks
stock_num = len(G1.nodes())
"""
money invested in each stock
"""
# total variance
var_sum = 0
for i in list(G1.nodes()):
    var_sum = var_sum + var_dict[num_to_id[i]]
# allocate money to each stock in proportion to its variance
per_price = dict()
for i in list(G1.nodes()):
    per_price[num_to_id[i]] = total_money * (var_dict[num_to_id[i]]/var_sum)
# number of shares of each stock at the first-day ('2020/6/19') price
save_per_stock_num = dict()
for i in list(G1.nodes()):
    save_per_stock_num[num_to_id[i]] = per_price[num_to_id[i]] / raw_asset_prices_df[num_to_id[i]]['2020/6/19']
# final value on 2021/6/18
finally_price = 0
for item,value in save_per_stock_num.items():
    finally_price = finally_price + raw_asset_prices_df[item]['2021/6/18'] * value
print('Final value of the variance weight portfolio of exploration sampling:', finally_price)
Final value of the variance weight portfolio of exploration sampling: 16271.27130227263